#! pip install -U textblob
#! python -m textblob.download_corpora
# pip install statannot
#!pip install nltk==3.3
#!pip install pyspellchecker
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import os
import seaborn as sns; sns.set(color_codes=True);
import matplotlib.pyplot as plt
import datetime
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from textblob import Word
from textblob import TextBlob
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
sid = SentimentIntensityAnalyzer()  # shared VADER analyzer, used by get_vader_score below
def get_vader_score(sent):
    """Return VADER's compound polarity score for *sent* (a float in [-1, 1])."""
    scores = sid.polarity_scores(sent)
    return scores['compound']
# Stopword list used to filter comment tokens.  The original first called
# stopwords.words('english') (requiring an nltk corpus download) and then
# immediately overwrote the result with this hard-coded list; the dead call
# has been removed.  Note: negation words (no/not/don't...) are deliberately
# absent so VADER can still see them — TODO confirm that was the intent.
stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "youre", "youve", "youll", "youd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "shes", 'her', 'hers', 'herself', 'it', "its", 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "thatll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 's', 't', 'can', 'will', 'just', 'should', "shouldve", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'ma', 'mightn', 'needn', "neednt", 'shan']
from spellchecker import SpellChecker
import re
WORD = re.compile(r'\w+')  # token = maximal run of word characters
spell = SpellChecker()  # pyspellchecker instance; presumably the default English dictionary — confirm
def reTokenize(doc):
    """Split *doc* into word tokens (maximal runs of \\w characters)."""
    return re.findall(r'\w+', doc)
def spell_correct(text):
    """Spell-correct an iterable of documents.

    Each document is tokenized with reTokenize, each token is corrected with
    pyspellchecker, and the lowercased tokens are joined back into one
    space-separated string per document.

    Returns a list of corrected strings, one per input document.
    """
    corrected_docs = []
    for doc in text:
        fixed = []
        for w in reTokenize(doc):
            suggestion = spell.correction(w)
            # BUG fix: correction() returns None when no candidate exists
            # (pyspellchecker >= 0.6); the original crashed on None.lower().
            # Fall back to the original token in that case.
            fixed.append((suggestion if suggestion is not None else w).lower())
        corrected_docs.append(' '.join(fixed))
    return corrected_docs
from progressbar import ProgressBar
pbar = ProgressBar()
st = PorterStemmer()  # NOTE(review): created but never used in the visible script
nan_value = float("NaN")
# Collect the per-file frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop is
# quadratic anyway.
frames = []
for filename in pbar(os.listdir(r'.\Komentarze2')):
    if filename.endswith(".xlsx"):
        df = pd.read_excel(os.path.join(r'.\Komentarze2', filename))
        # keep only replies (level > 1) whose object_type is "data"
        df = df.loc[(df['level'] > 1) & (df['object_type'] == "data")]
        df.replace("", nan_value, inplace=True)
        df.dropna(subset=["message"], inplace=True)
        df.rename(columns={'path': 'company'}, inplace=True)
        # created_time arrives as a string; keep only the YYYY-MM-DD prefix
        df['created_time'] = df['created_time'].apply(lambda x: x[0:10])
        # first path segment is the company page name.  The original assigned
        # str.split(..., expand=True) to a single column, which breaks as soon
        # as a path contains more than one "/".
        df["company"] = df["company"].str.split("/").str[0]
        df['message'] = df.message.astype(str)
        df = df.drop_duplicates(subset=['message', 'created_time'])  # keep first occurrence
        df['message'] = df['message'].apply(lambda x: " ".join(x.lower() for x in x.split()))
        # strip punctuation; regex=True is required by pandas >= 2.0 for pattern replace
        df['message'] = df['message'].str.replace(r'[^\w\s]', '', regex=True)
        df['message'] = df['message'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
        df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
        # drop the 200 rarest tokens of this file
        frequnp = pd.Series(' '.join(df['message']).split()).value_counts()[-200:]
        frequnp = set(frequnp.index)  # set for O(1) membership in the filter below
        df['message'] = df['message'].apply(lambda x: " ".join(x for x in x.split() if x not in frequnp))
        df['word_count'] = df['message'].apply(lambda x: len(str(x).split(" ")))
        df = df.loc[df['word_count'] > 2]  # keep messages with at least 3 tokens
        frames.append(df[['company', 'message', 'created_time']])
dane_polaczone = (pd.concat(frames, ignore_index=True) if frames
                  else pd.DataFrame({'company': [], 'message': [], 'created_time': []}))
dane_polaczone.to_csv(r'.\Komentarze2\polaczone_firmy.csv', index=False)
dane_firmy = pd.read_csv(r'.\Komentarze2\polaczone_firmy.csv', parse_dates=['created_time'])
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is the replacement (same ISO week numbers).
dane_firmy['weekno'] = dane_firmy['created_time'].dt.isocalendar().week
dane_firmy['month'] = dane_firmy['created_time'].dt.strftime('%m.%Y')
# composite keys used to join comments with the daily/weekly/monthly price tables
dane_firmy['dayid'] = dane_firmy['company'] + dane_firmy['created_time'].astype(str)
dane_firmy['weekid'] = dane_firmy['company'] + dane_firmy['weekno'].astype(str)
dane_firmy['monthid'] = dane_firmy['company'] + dane_firmy['month'].astype(str)
# VADER compound score per message; rows scoring exactly 0 (no sentiment signal)
# are dropped from the analysis
dane_firmy['sentiment'] = dane_firmy['message'].apply(get_vader_score)
dane_firmy = dane_firmy.loc[dane_firmy['sentiment'] != 0]
print(dane_firmy[['message', "sentiment"]].head(50))
kursy_dzienne = pd.read_excel(r'.\Kursy dzienne\kursy_dzienne.xlsx', parse_dates=['Date'], thousands=',')
kursy_dzienne['dayid'] = kursy_dzienne['company'] + kursy_dzienne['Date'].astype(str)
kursy_dzienne = kursy_dzienne[['dayid', 'company', 'Date', 'Price']]
# dayid = company + ISO date string, so this sort is chronological within company
kursy_dzienne = kursy_dzienne.sort_values(by=['dayid'])
# day-over-day return per company
kursy_dzienne['rate_change'] = kursy_dzienne.groupby('company').Price.pct_change()
# Mean daily sentiment per company-day.  as_index=False keeps 'dayid' as a
# column so merge(on='dayid') below works (the original grouped 'dayid' into
# the index — merge on= needs a column — and also averaged a datetime column,
# which newer pandas rejects).
dzienne_pogrupowane = dane_firmy.groupby('dayid', as_index=False)['sentiment'].mean()
dzienne_polaczone = pd.merge(dzienne_pogrupowane,
                             kursy_dzienne[['dayid', 'Date', 'company', 'Price', 'rate_change']],
                             on='dayid',
                             how='inner')
def facetgrid_two_axes(*args, **kwargs):
    """FacetGrid panel helper: Price vs Date on the left axis (tomato) and,
    when dual_axis is true, sentiment on a twin right axis (deepskyblue).

    Expects 'data' (with Date/Price/sentiment columns), 'dual_axis' and
    optionally 'alpha' in kwargs; FacetGrid's injected 'color' is discarded
    because fixed colors are used.
    """
    data = kwargs.pop('data')
    dual_axis = kwargs.pop('dual_axis')
    alpha = kwargs.pop('alpha', 1)
    kwargs.pop('color')
    ax = plt.gca()
    ax.yaxis.label.set_color('tomato')
    if dual_axis:
        ax2 = ax.twinx()
    ax.plot(data['Date'], data['Price'], **kwargs, color='tomato', alpha=alpha)
    if dual_axis:
        ax2.plot(data['Date'], data['sentiment'], **kwargs, color='deepskyblue', alpha=alpha)
        ax2.set_ylabel('Sentiment', size=15)
        ax2.yaxis.label.set_color('deepskyblue')
    # BUG fix: both labels below said 'Sentiment' although this axis shows
    # Price over Date (the FacetGrid caller overrides them anyway).
    ax.set_ylabel('Price', size=15)
    ax.set_xlabel('Date', size=15)
# One panel per company: daily Price (left axis) and mean sentiment (right axis)
win_plot = sns.FacetGrid(dzienne_polaczone, col='company', col_wrap = 2, height=6, aspect=1.6, sharex=False, sharey=False)
(win_plot.map_dataframe(facetgrid_two_axes, dual_axis=True)
 .set_xlabels("Date", size=15)
 .set_ylabels("Price", size=15)
 .set_titles(size=20)
 )
plt.subplots_adjust(hspace=0.2, wspace=0.2)
plt.show()
# Drop rows containing NaN/±inf in any column (positional .any(1) was removed
# in pandas 2.0), then 3-sigma outliers on sentiment and rate_change.
dzienne_polaczone = dzienne_polaczone[~dzienne_polaczone.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
dzienne_polaczone = dzienne_polaczone[(np.abs(stats.zscore(dzienne_polaczone['sentiment'])) < 3)]
dzienne_polaczone = dzienne_polaczone[(np.abs(stats.zscore(dzienne_polaczone['rate_change'])) < 3)]
# Sentiment leads (sent-k: k rows ahead) and lags (sent+k: k rows back) within
# each company; rows are date-ordered per company via the dayid sort upstream.
# Column creation order matches the original (sent-6..sent-1, sent+1..sent+6).
sent_by_company = dzienne_polaczone.groupby('company')['sentiment']
for k in range(6, 0, -1):
    dzienne_polaczone[f'sent-{k}'] = sent_by_company.shift(-k)
for k in range(1, 7):
    dzienne_polaczone[f'sent+{k}'] = sent_by_company.shift(k)
# Per-company regression of daily return on sentiment
g = sns.FacetGrid(dzienne_polaczone, col='company', col_wrap = 2, height=6, aspect=1.6, sharex = False, sharey=False)
g.map(sns.regplot, "sentiment", "rate_change", color = "darkturquoise").set_xlabels("Sentiment", size=15).set_ylabels("Price", size=15).set_titles(size=20)
plt.subplots_adjust(hspace=0.2)
# Per-company Kendall correlation heatmap (sentiment vs daily return)
g = sns.FacetGrid(dzienne_polaczone[['sentiment','rate_change','company']], col='company', col_wrap=3, height=3.5, aspect=1.2, sharex = False, sharey=False)
g.map_dataframe(lambda data, color: sns.heatmap(data.corr(method='kendall'), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r")))
kursy_tygodniowe = pd.read_excel(r'.\Kursy tygodniowe\kursy_tygodniowe.xlsx', parse_dates=['Date'], thousands=',')
# Series.dt.week was removed in pandas 2.0; isocalendar().week is the replacement
kursy_tygodniowe['weekno'] = kursy_tygodniowe['Date'].dt.isocalendar().week
kursy_tygodniowe['weekid'] = kursy_tygodniowe['company'] + kursy_tygodniowe['weekno'].astype(str)
kursy_tygodniowe = kursy_tygodniowe[['weekid', 'company', 'weekno', 'Price']]
# NOTE(review): sorting by week number alone assumes the data covers a single
# year — confirm, otherwise pct_change mixes years.
kursy_tygodniowe = kursy_tygodniowe.sort_values(['company', 'weekno'], ascending=[True, True])
# week-over-week return per company
kursy_tygodniowe['rate_change'] = kursy_tygodniowe.groupby('company').Price.pct_change()
# Mean weekly sentiment per company-week; as_index=False keeps 'weekid' as a
# column so merge(on='weekid') works (the original grouped it into the index).
tygodniowe_pogrupowane = dane_firmy.groupby('weekid', as_index=False)[['sentiment', 'weekno']].mean()
tygodniowe_polaczone = pd.merge(tygodniowe_pogrupowane,
                                kursy_tygodniowe[['Price', 'rate_change', 'weekid', 'company']],
                                on='weekid',
                                how='inner')
tygodniowe_polaczone = tygodniowe_polaczone.sort_values(['company', 'weekno'], ascending=[True, True])
# drop NaN/inf rows, then 3-sigma outliers (.any(1) removed in pandas 2.0)
tygodniowe_polaczone = tygodniowe_polaczone[~tygodniowe_polaczone.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
tygodniowe_polaczone = tygodniowe_polaczone[(np.abs(stats.zscore(tygodniowe_polaczone['sentiment'])) < 3)]
tygodniowe_polaczone = tygodniowe_polaczone[(np.abs(stats.zscore(tygodniowe_polaczone['rate_change'])) < 3)]
def facetgrid_two_axes(*args, **kwargs):
    """FacetGrid panel helper (weekly variant): Price vs week number on the
    left axis (tomato) and, when dual_axis, sentiment on a twin right axis
    (deepskyblue).  Redefines the daily variant above.
    """
    data = kwargs.pop('data')
    dual_axis = kwargs.pop('dual_axis')
    alpha = kwargs.pop('alpha', 1)
    kwargs.pop('color')  # discard FacetGrid's palette color; fixed colors used
    ax = plt.gca()
    ax.yaxis.label.set_color('tomato')
    if dual_axis:
        ax2 = ax.twinx()
    ax.plot(data['weekno'], data['Price'], **kwargs, color='tomato', alpha=alpha)
    if dual_axis:
        ax2.plot(data['weekno'], data['sentiment'], **kwargs, color='deepskyblue', alpha=alpha)
        ax2.set_ylabel('Sentiment', size=15)
        ax2.yaxis.label.set_color('deepskyblue')
    # BUG fix: both labels said 'Sentiment' although this axis shows Price
    # over the week number (the FacetGrid caller overrides them anyway).
    ax.set_ylabel('Price', size=15)
    ax.set_xlabel('Week number', size=15)
# One panel per company: weekly Price (left axis) and mean sentiment (right axis)
win_plot = sns.FacetGrid(tygodniowe_polaczone, col='company', col_wrap = 2, height=6, aspect=1.6, sharex=False, sharey=False)
(win_plot.map_dataframe(facetgrid_two_axes, dual_axis=True)
 .set_xlabels("weekno", size=15)
 .set_ylabels("Price", size=15)
 .set_titles(size=20)
 )
plt.subplots_adjust(hspace=0.2, wspace=0.2)
plt.show()
# Per-company regression of weekly return on sentiment
g = sns.FacetGrid(tygodniowe_polaczone, col='company', col_wrap = 3, height=3, aspect=1.6, sharex = False, sharey=False)
g.map(sns.regplot, "sentiment", "rate_change", color = "darkturquoise").set_xlabels("Sentiment", size=10).set_ylabels("Price", size=10).set_titles(size=12)
plt.subplots_adjust(hspace=0.2)
# Per-company Kendall correlation heatmap (includes weekno)
g = sns.FacetGrid(tygodniowe_polaczone[['sentiment','rate_change', 'weekno', 'company']], col='company', col_wrap = 3, height=3.5, aspect=1.2, sharex = False, sharey=False)
g.map_dataframe(lambda data, color: sns.heatmap(data.corr(method='kendall'), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r")))
kursy_miesieczne = pd.read_excel(r'.\Kursy miesieczne\kursy_miesieczne.xlsx', parse_dates=['Date'], thousands=',')
# Date is already parsed by read_excel; format as MM.YYYY directly (the
# original re-ran pd.to_datetime on parsed dates, which is a no-op).
kursy_miesieczne['month'] = kursy_miesieczne['Date'].dt.strftime('%m.%Y')
kursy_miesieczne['monthid'] = kursy_miesieczne['company'] + kursy_miesieczne['month']
# BUG fix: sort chronologically within company BEFORE computing returns.
# The original sorted by monthid, and the 'MM.YYYY' string orders
# 01.2020 < 01.2021 < 02.2020, so pct_change mixed months across years.
kursy_miesieczne = kursy_miesieczne.sort_values(['company', 'Date'])
kursy_miesieczne = kursy_miesieczne[['monthid', 'company', 'month', 'Price']]
# month-over-month return per company
kursy_miesieczne['rate_change'] = kursy_miesieczne.groupby('company').Price.pct_change()
# Mean monthly sentiment per company-month; as_index=False keeps 'monthid' as
# a column so merge(on='monthid') works.
miesieczne_pogrupowane = dane_firmy.groupby('monthid', as_index=False)['sentiment'].mean()
miesieczne_polaczone = pd.merge(miesieczne_pogrupowane,
                                kursy_miesieczne[['Price', 'rate_change', 'monthid', 'company', 'month']],
                                on='monthid',
                                how='inner')
# drop NaN/inf rows, then 3-sigma sentiment outliers (.any(1) removed in pandas 2.0)
miesieczne_polaczone = miesieczne_polaczone[~miesieczne_polaczone.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
miesieczne_polaczone = miesieczne_polaczone[(np.abs(stats.zscore(miesieczne_polaczone['sentiment'])) < 3)]
# NOTE(review): unlike the daily/weekly pipelines, rate_change outliers are
# not filtered here — confirm whether that is intentional.
def facetgrid_two_axes(*args, **kwargs):
    """FacetGrid panel helper (monthly variant): Price vs month on the left
    axis (tomato) and, when dual_axis, sentiment on a twin right axis
    (deepskyblue).  Redefines the weekly variant above.
    """
    data = kwargs.pop('data')
    dual_axis = kwargs.pop('dual_axis')
    alpha = kwargs.pop('alpha', 1)
    kwargs.pop('color')  # discard FacetGrid's palette color; fixed colors used
    ax = plt.gca()
    ax.yaxis.label.set_color('tomato')
    if dual_axis:
        ax2 = ax.twinx()
    ax.plot(data['month'], data['Price'], **kwargs, color='tomato', alpha=alpha)
    if dual_axis:
        ax2.plot(data['month'], data['sentiment'], **kwargs, color='deepskyblue', alpha=alpha)
        ax2.set_ylabel('Sentiment', size=15)
        ax2.yaxis.label.set_color('deepskyblue')
    # BUG fix: both labels said 'Sentiment' although this axis shows Price
    # over the month (the FacetGrid caller overrides them anyway).
    ax.set_ylabel('Price', size=15)
    ax.set_xlabel('Month', size=15)
# One panel per company: monthly Price (left axis) and mean sentiment (right axis)
win_plot = sns.FacetGrid(miesieczne_polaczone, col='company', col_wrap = 2, height=6, aspect=1.6, sharex=False, sharey=False)
(win_plot.map_dataframe(facetgrid_two_axes, dual_axis=True)
 .set_xlabels("Month", size=15)
 .set_ylabels("Price", size=15)
 .set_titles(size=20)
 )
plt.subplots_adjust(hspace=0.2, wspace=0.2)
# Per-company regression of monthly return on sentiment
g = sns.FacetGrid(miesieczne_polaczone, col='company', col_wrap = 3, height=3, aspect=1.6, sharex = False, sharey=False)
g.map(sns.regplot, "sentiment", "rate_change", color = "darkturquoise").set_xlabels("Sentiment", size=10).set_ylabels("Price", size=10).set_titles(size=12)
plt.subplots_adjust(hspace=0.2)
g = sns.FacetGrid(miesieczne_polaczone[['sentiment','rate_change','company']], col='company', col_wrap = 3, height=3.5, aspect=1.2, sharex = False, sharey=False)
# method='kendall' added for consistency: every other heatmap in this script
# uses Kendall's tau, so the bare corr() (Pearson) here looked accidental —
# revert if Pearson was intended.
g.map_dataframe(lambda data, color: sns.heatmap(data.corr(method='kendall'), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r")))
# Facebook page names of the analysed companies, grouped into four sectors
Technology = ['Adobe', 'Google', 'PayPal', 'Intel', 'Microsoft', 'AMD', 'HP', 'MotorolaUS']
Retail = ['Amazon', 'homedepot', 'Costco', 'walmart', 'ebay', 'bestbuy', 'lowes', 'target']
FMCG = ['jnj', 'proctergamble', 'PepsiUS', 'CocaColaUnitedStates', 'Colgate', 'mondelezinternational', 'KimberlyClarkCorp', 'EsteeLauder']
CommEntert = ['Disney', 'netflixus', 'verizon', 'ATT', 'TwitterInc', 'Tmobile', 'EA', 'Comcast']  # communications & entertainment
# Work on a copy: the original aliased dzienne_polaczone, so adding 'sector'
# silently mutated the daily frame as well.
df_sektory_dzien = dzienne_polaczone.copy()
df_sektory_dzien['sector'] = df_sektory_dzien['company'].apply(
    lambda x: "Technology" if x in Technology
    else "Retail" if x in Retail
    else "FMCG" if x in FMCG
    else "CommEntert" if x in CommEntert
    else "Other")
# Per-sector regression of daily return on sentiment
g = sns.FacetGrid(df_sektory_dzien, col='sector', col_wrap=2, height=4, aspect=1.6, sharex=False, sharey=False)
g.map(sns.regplot, "sentiment", "rate_change", color="darkturquoise").set_xlabels("Sentiment", size=12).set_ylabels("Price", size=12).set_titles(size=15)
plt.subplots_adjust(hspace=0.2)
# Per-sector Kendall correlation heatmap
g = sns.FacetGrid(df_sektory_dzien[['sentiment', 'rate_change', 'sector']], col='sector', col_wrap=2, height=4, aspect=1.3, sharex=False, sharey=False)
g.map_dataframe(lambda data, color: sns.heatmap(data.corr(method='kendall'), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r")))
# Copy to avoid mutating tygodniowe_polaczone when adding 'sector'
df_sektory_tydzien = tygodniowe_polaczone.copy()
df_sektory_tydzien['sector'] = df_sektory_tydzien['company'].apply(
    lambda x: "Technology" if x in Technology
    else "Retail" if x in Retail
    else "FMCG" if x in FMCG
    else "CommEntert" if x in CommEntert
    else "Other")
# Per-sector regression of weekly return on sentiment
g = sns.FacetGrid(df_sektory_tydzien, col='sector', col_wrap=2, height=4, aspect=1.6, sharex=False, sharey=False)
g.map(sns.regplot, "sentiment", "rate_change", color="darkturquoise").set_xlabels("Sentiment", size=12).set_ylabels("Price", size=12).set_titles(size=15)
plt.subplots_adjust(hspace=0.2)
# Per-sector Kendall correlation heatmap (includes weekno)
g = sns.FacetGrid(df_sektory_tydzien[['sentiment', 'rate_change', 'weekno', 'sector']], col='sector', col_wrap=2, height=4, aspect=1.3, sharex=False, sharey=False)
g.map_dataframe(lambda data, color: sns.heatmap(data.corr(method='kendall'), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r")))
# Copy to avoid mutating miesieczne_polaczone when adding 'sector'
df_sektory_miesiac = miesieczne_polaczone.copy()
df_sektory_miesiac['sector'] = df_sektory_miesiac['company'].apply(
    lambda x: "Technology" if x in Technology
    else "Retail" if x in Retail
    else "FMCG" if x in FMCG
    else "CommEntert" if x in CommEntert
    else "Other")
# Per-sector regression of monthly return on sentiment
g = sns.FacetGrid(df_sektory_miesiac, col='sector', col_wrap=2, height=4, aspect=1.6, sharex=False, sharey=False)
g.map(sns.regplot, "sentiment", "rate_change", color="darkturquoise").set_xlabels("Sentiment", size=12).set_ylabels("Price", size=12).set_titles(size=15)
plt.subplots_adjust(hspace=0.2)
# Per-sector Kendall correlation heatmap
g = sns.FacetGrid(df_sektory_miesiac[['sentiment', 'rate_change', 'sector']], col='sector', col_wrap=2, height=4, aspect=1.3, sharex=False, sharey=False)
g.map_dataframe(lambda data, color: sns.heatmap(data.corr(method='kendall'), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r")))
# Pooled (all-companies) frames with NaN/±inf rows dropped; positional
# .any(1) was removed in pandas 2.0, hence axis=1.
dzienne_azbiorcza = dzienne_polaczone[~dzienne_polaczone.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
tygodniowe_azbiorcza = tygodniowe_polaczone[~tygodniowe_polaczone.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
miesieczne_azbiorcza = miesieczne_polaczone[~miesieczne_polaczone.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
# Pooled regressions: daily / weekly / monthly return vs sentiment
f, axes = plt.subplots(1, 3, figsize=[15, 5])
sns.regplot(x="sentiment", y="rate_change", data=dzienne_azbiorcza, color="darkturquoise", ax=axes[0])
sns.regplot(x="sentiment", y="rate_change", data=tygodniowe_azbiorcza, color="darkturquoise", ax=axes[1])
sns.regplot(x="sentiment", y="rate_change", data=miesieczne_azbiorcza, color="darkturquoise", ax=axes[2])
# Pooled Kendall correlation heatmaps for the three horizons
f, axes = plt.subplots(1, 3, figsize=[20, 5])
sns.heatmap(dzienne_polaczone[['sentiment','rate_change','company']].corr(method='kendall'), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r"), ax=axes[0])
sns.heatmap(tygodniowe_polaczone[['sentiment','rate_change','weekno', 'company']].corr(method='kendall'), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r"), ax=axes[1])
sns.heatmap(miesieczne_polaczone[['sentiment','rate_change','company']].corr(method='kendall'), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r"), ax=axes[2])
# Features: same-day sentiment plus six leads and six lags; target: daily return
X = dzienne_azbiorcza[['sentiment', 'sent-6', 'sent-5', 'sent-4', 'sent-3', 'sent-2', 'sent-1', 'sent+1', 'sent+2', 'sent+3', 'sent+4', 'sent+5', 'sent+6']]
y = dzienne_azbiorcza['rate_change']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)
CART = tree.DecisionTreeRegressor(random_state=42,ccp_alpha=0.0)
CART_model=CART.fit(X_train,y_train)
path = CART.cost_complexity_pruning_path(X_train, y_train) # tree pruning (cost-complexity path)
ccp_alphas, impurities = path.ccp_alphas[::10], path.impurities[::10]  # subsample every 10th alpha
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set");
# fit one tree per candidate pruning alpha
clfs = []
for ccp_alpha in ccp_alphas:
    clf = tree.DecisionTreeRegressor(random_state=42, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
    clfs[-1].tree_.node_count, ccp_alphas[-1]))
def RMSE(model, X, y):
    """Root-mean-squared error of *model*'s predictions on (X, y)."""
    residuals = model.predict(X) - y
    return np.sqrt((residuals ** 2).mean())
# RMSE of every pruned tree on test and train sets
test_scores = [RMSE(clf,X_test,y_test) for clf in clfs]
train_scores = [RMSE(clf,X_train,y_train) for clf in clfs]
fig, ax = plt.subplots(figsize=[10,10])
ax.set_xlabel("alpha")
ax.set_ylabel("RMSE")
ax.set_title("RMSE vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train",
        drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test",
        drawstyle="steps-post")
ax.legend()
plt.show()
Best_CART = clfs[np.argmin(test_scores)]  # tree with the lowest test RMSE
Best_CART.ccp_alpha  # notebook-style inspection; has no effect when run as a script
# Feature importances scaled to percent of the maximum; only the top 6 are shown
feature_importance = Best_CART.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5  # bar centers
num_feat = 6  # number of top features to display
plt.figure(figsize=[10,5])
plt.barh(pos[-num_feat:], feature_importance[sorted_idx][-num_feat:], align='center')
plt.yticks(pos[-num_feat:], X_train.columns[sorted_idx][-num_feat:])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
# --- Random forest on the daily sentiment leads/lags ---
X = dzienne_azbiorcza[['sentiment', 'sent-6', 'sent-5', 'sent-4', 'sent-3', 'sent-2', 'sent-1', 'sent+1', 'sent+2', 'sent+3', 'sent+4', 'sent+5', 'sent+6']]
y = dzienne_azbiorcza['rate_change']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rfr = RandomForestRegressor
N = [10, 50, 100, 200, 300, 400, 500]
# test RMSE as a function of the number of trees
RMSE_RF = [RMSE(rfr(n, n_jobs=-1).fit(X_train, y_train), X_test, y_test) for n in N]
plt.plot(N, RMSE_RF, '.-', color='g')
best_n_rf = N[np.argmin(RMSE_RF)]  # originally a bare notebook expression
features = np.linspace(1, X_train.shape[1], 10).astype(int)
# BUG fix: the comprehension below ignored its loop variable, fitting the
# identical 300-tree forest 10 times; n is now passed as max_features so the
# sweep actually varies something.
RMSE_RF_features = [RMSE(rfr(300, max_features=n, n_jobs=-1).fit(X_train, y_train), X_test, y_test) for n in features]
# NOTE(review): 300 trees are hard-coded here rather than best_n_rf — confirm.
Best_RF = RandomForestRegressor(300, n_jobs=-1).fit(X_train, y_train)
# Feature importances scaled to percent of the maximum; top 6 shown
feature_importance = Best_RF.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
num_feat = 6
plt.figure(figsize=[10, 5])
plt.barh(pos[-num_feat:], feature_importance[sorted_idx][-num_feat:], align='center')
plt.yticks(pos[-num_feat:], X_train.columns[sorted_idx][-num_feat:])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
# --- Gradient boosting on the daily sentiment leads/lags ---
X = dzienne_azbiorcza[['sentiment', 'sent-6', 'sent-5', 'sent-4', 'sent-3', 'sent-2', 'sent-1', 'sent+1', 'sent+2', 'sent+3', 'sent+4', 'sent+5', 'sent+6']]
y = dzienne_azbiorcza['rate_change']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
gbr = GradientBoostingRegressor
N = [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
# test RMSE as a function of the number of boosting stages
RMSE_GBT = [RMSE(gbr(n_estimators=n).fit(X_train, y_train), X_test, y_test) for n in N]
plt.plot(N, RMSE_GBT, '.-', color='y')
best_n_gbt = N[np.argmin(RMSE_GBT)]  # originally a bare notebook expression
# NOTE(review): n_estimators is fixed at 10 rather than best_n_gbt — confirm.
Best_GBT = GradientBoostingRegressor(n_estimators=10).fit(X_train, y_train)
n_stages = Best_GBT.n_estimators  # replaces the magic constant 10 below
test_score = np.zeros((n_stages,), dtype=np.float64)
for i, y_pred in enumerate(Best_GBT.staged_predict(X_test)):
    # BUG fix: the private Best_GBT.loss_ attribute was removed in
    # scikit-learn 1.1; for the default squared-error loss the staged
    # deviance is the mean squared error of the staged predictions.
    test_score[i] = np.mean((y_test - y_pred) ** 2)
plt.figure(figsize=(10, 5))
plt.title('Deviance')
plt.plot(np.arange(n_stages) + 1, Best_GBT.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(n_stages) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
# GBT feature importances scaled to percent of the maximum; top 6 shown
feature_importance = Best_GBT.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5  # bar centers
num_feat = 6  # number of top features to display
plt.figure(figsize=[10,5])
plt.barh(pos[-num_feat:], feature_importance[sorted_idx][-num_feat:], align='center')
plt.yticks(pos[-num_feat:], X_train.columns[sorted_idx][-num_feat:])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
# Pearson correlation of the daily return with three individual sentiment
# leads/lags — presumably the ones that ranked highest in the importance
# plots above (confirm the selection).
f, axes = plt.subplots(1, 3, figsize =[15,4])
sns.heatmap(dzienne_azbiorcza[['rate_change', 'sent-3']].corr(), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r"), ax=axes[0])
sns.heatmap(dzienne_azbiorcza[['rate_change', 'sent+6']].corr(), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r"), ax=axes[1])
sns.heatmap(dzienne_azbiorcza[['rate_change', 'sent-2']].corr(), linewidths=0, annot=True, vmin=-1, vmax=1, cmap=("Spectral_r"), ax=axes[2])